data/raw-data/data2.txt: Used for the main analysis
data/raw-data/global burden disease region.xlsx: Gives countries for global regions used in the paper.
data/raw-data/owid-covid-data_220208_22c8de6.txt: Maybe the raw data
2.1 Explore the raw data
Find some information about the first dataset and use it to annotate or name it. It looks like only the file data/raw-data/data2.txt was used in the analysis.
Open Questions:
When was the data accessed?
Several variables have typos:
data_2: “Vccination21”
Check on basic descriptive statistics like missingness.
Show the code
# Hmisc::contents() gives a quick scan for NAs, classes, and storage modes
contents(data_1)
data_1 Contents
Data frame:data_1
160237 observations and 67 variables, maximum # NAs:155098
Name
Class
Storage
NAs
iso_code
character
0
continent
character
0
location
character
0
date
IDate
integer
0
total_cases
double
2875
new_cases
double
2909
new_cases_smoothed
double
4060
total_deaths
double
20477
new_deaths
double
20310
new_deaths_smoothed
double
20440
total_cases_per_million
double
3607
new_cases_per_million
double
3641
new_cases_smoothed_per_million
double
4787
total_deaths_per_million
double
21196
new_deaths_per_million
double
21029
new_deaths_smoothed_per_million
double
21159
reproduction_rate
double
39346
icu_patients
double
137694
icu_patients_per_million
double
137694
hosp_patients
double
136956
hosp_patients_per_million
double
136956
weekly_icu_admissions
double
155098
weekly_icu_admissions_per_million
double
155098
weekly_hosp_admissions
double
149821
weekly_hosp_admissions_per_million
double
149821
new_tests
double
94096
total_tests
double
92909
total_tests_per_thousand
double
92909
new_tests_per_thousand
double
94096
new_tests_smoothed
double
78677
new_tests_smoothed_per_thousand
double
78677
positive_rate
double
83915
tests_per_case
double
84475
tests_units
character
0
total_vaccinations
double
117442
people_vaccinated
double
119464
people_fully_vaccinated
double
122214
total_boosters
double
145084
new_vaccinations
double
124688
new_vaccinations_smoothed
double
81607
total_vaccinations_per_hundred
double
117442
people_vaccinated_per_hundred
double
119464
people_fully_vaccinated_per_hundred
double
122214
total_boosters_per_hundred
double
145084
new_vaccinations_smoothed_per_million
double
81607
new_people_vaccinated_smoothed
double
82782
new_people_vaccinated_smoothed_per_hundred
double
82782
stringency_index
double
34430
population
double
1049
population_density
double
17606
median_age
double
27311
aged_65_older
double
28753
aged_70_older
double
28024
gdp_per_capita
double
26692
extreme_poverty
double
72237
cardiovasc_death_rate
double
28338
diabetes_prevalence
double
21427
female_smokers
double
57948
male_smokers
double
59416
handwashing_facilities
double
94165
hospital_beds_per_thousand
double
40958
life_expectancy
double
10666
human_development_index
double
28863
excess_mortality_cumulative_absolute
double
154777
excess_mortality_cumulative
double
154777
excess_mortality
double
154777
excess_mortality_cumulative_per_million
double
154777
Show the code
# Same quick NA/class overview (Hmisc) for the second dataset
contents(data_2)
data_2 Contents
Data frame:data_2
109 observations and 41 variables, maximum # NAs:78
Name
Storage
NAs
location
character
0
GBDR7
character
0
Testing20
double
0
Incidence20
double
0
Mortality20
double
0
Vaccination20
double
78
Testing21
double
0
Incidence21
double
0
Mortality21
double
0
Vccination21
double
0
max20_21_tests
double
0
max20_21_cases
double
0
max20_21_deaths
double
0
max20_21_peopleVaccinated
double
0
Age
double
0
GE
double
0
VA
double
0
PS
double
0
RQ
double
0
RL
double
0
CC
double
0
LEB
double
0
EYS
double
0
MYS
double
0
GNI
double
0
Gini
double
0
Urban
double
0
CHE
double
0
MMR
integer
0
FLB
double
0
ABR
double
0
SSP
double
0
PSE
double
0
EP
double
0
VE
double
0
Population
integer
0
PD
double
0
AAG
double
0
GDP
double
0
GI
double
0
HDI
double
0
Show the code
# Quick NA/class overview (Hmisc) for the region-mapping dataset
contents(data_3)
data_3 Contents
Data frame:data_3
113 observations and 5 variables, maximum # NAs:6
Name
Storage
NAs
Country.Territory
character
0
Code
character
0
Region
character
0
GBDR 21
character
6
GBDR7
character
0
This describe() call would be nice, but currently the rendering fails when loading the qreport package.
Show the code
# Hmisc::describe() summary, rendered as a scrollable HTML table
d <- describe(data_2)
html(d, size = 80, scroll = TRUE)
Summaries can also quickly be created using the movStats() function
Show the code
# A shorthand for calculating summaries using data.table
writeLines("Number of tests 2020 per meta-region:")
Check the data sources. data_2 contains per-country information on testing, incidences, mortality etc.
Show the code
# Check number of locations
writeLines("Total number of countries:")
Total number of countries:
Show the code
# data.table: count of distinct country names in data_2
data_2[, uniqueN(location)]
[1] 109
Show the code
# Number of entries per location
writeLines("Countries per meta-region:")
Countries per meta-region:
Show the code
# data.table: row count (.N) grouped by GBD meta-region
data_2[, .N, GBDR7]
GBDR7
N
Central Europe, Eastern Europe, and Central Asia
22
High-Income
31
Latin America and Caribbean
16
North Africa and Middle East
10
Southeast Asia, East Asia, and Oceania
12
Sub-Saharan Africa
18
Create a long-form version of the data and print summary information.
Show the code
# Long format helps evaluate each variable individually
data_2_long <- melt(data_2, id.vars = c("location", "GBDR7")) |>
  suppressWarnings()
writeLines("Variables and missings per meta-region:")
We have several indices, some are a combination of others, like the HDI or GI.
Show the code
# The full set of candidate predictor indices considered in the paper
all_indices <- c(
  "HDI", "LEB", "EYS", "MYS", "GNI", "GDP", "CHE", "GI", "MMR",
  "ABR", "FLB", "PSE", "Gini", "UP", "PD", "EP", "VE", "AAG"
)
# I want the column names to match the proper index names:
# show which of these are NOT yet columns of data_2
all_indices[!(all_indices %in% colnames(data_2))]
[1] "UP"
Show the code
# Correct typo in column name
setnames(data_2, "Vccination21", "Vaccination21")
# Rename Urban to the index abbreviation UP
setnames(x = data_2, "Urban", "UP")
# I want to remember what indices are for what and how they are connected
indices_structured <- list(
  Wealth = list(
    "HDI" = c("LEB", "EYS", "MYS", "GNI"),
    "GDP" = c(),
    "CHE" = c()
  ),
  Inequality = list(
    "GI" = c("MMR", "ABR", "FLB", "PSE"),
    "Gini" = c()
  ),
  Demographic_SES = list(
    "UP" = c(),
    "PD" = c(),
    "EP" = c(),
    "VE" = c()
  ),
  Governance = list("AAG" = c()),
  "Mortality" = c(),
  "Incidence" = c(),
  "Vaccination" = c(),
  "Test_Capacity" = c()
)
3.1 Index correlation
I can plot basic overviews of the predictor data. First plot correlation of all variables.
# Histogram of every candidate index on one page (6 x 3 grid, base graphics)
par(mfrow = c(6, 3))
options(grType = 'base')
for (i in seq_along(all_indices)) {
  # i <- 1
  hist(
    unlist(data_2[, .SD, .SDcols = all_indices[i]]),
    main = all_indices[i], xlab = NULL, ylab = NULL
  )
}
Show the code
# Restore plotly rendering and the single-panel layout
options(grType = 'plotly')
par(mfrow = c(1, 1))
Create data for plotting.
Show the code
# For plotting by year I need to melt the data by year
data_2_year <- melt(
  data_2,
  id.vars = c("location", "GBDR7"),
  measure.vars = c("Testing20", "Testing21"),
  variable.name = "Year", value.name = "Tests"
)
data_2_year[, Year := ifelse(Year == "Testing20", "2020", "2021")]
# Get log of PD for plotting
data_2$log_PD <- log10(data_2$PD)
# Select my interesting indices
nci <- c("CHE", "UP", "log_PD", "EP", "GI", "VE")
# Add the indices, matched back from data_2 by location
m1 <- match(data_2_year$location, data_2$location)
data_2_year[, c(nci, "Population") :=
  data_2[(m1), .SD, .SDcols = c((nci), "Population")]]
# Plot data with their mean
data_2_plot <- copy(data_2_year)
# Melt again to plot all indices at the same time
data_2_plot <- melt(
  data_2_plot,
  measure.vars = nci,
  variable.name = "Index", value.name = "Value",
  variable.factor = FALSE
)
Per-region display to illustrate the effect difference within the regions.
Show the code
# Simpson's Paradox: per-region regressions can differ from the pooled trend
plot_list <- vector("list", length(nci))
names(plot_list) <- nci
for (i in seq_along(names(plot_list))) {
  index <- names(plot_list)[i]
  # i <- "GI"
  p <- ggplot(
    data_2_plot[Index == (index)],
    aes(x = Value, y = Tests, col = GBDR7, size = Population)
  ) +
    geom_point() +
    facet_grid(Year ~ GBDR7) +
    scale_y_continuous(
      trans = "log10",
      labels = trans_format("log10", math_format(10^.x))
    ) +
    annotation_logticks(sides = "l", outside = TRUE) +
    coord_cartesian(clip = "off") +
    labs(y = "Testing", x = index) +
    guides(size = "none") +
    theme(
      legend.position = "bottom", legend.direction = "vertical",
      legend.justification = "left", axis.ticks.y = element_blank(),
      strip.text.x = element_blank()
    ) +
    geom_smooth(
      method = "lm", aes(x = Value, y = Tests, group = GBDR7),
      col = "indianred4", formula = "y ~ x"
    )
  # Log scale makes more sense for GDP
  # if (index == "GDP") {
  #   p <- p + scale_x_continuous(
  #     trans = "log10",
  #     labels = trans_format("log10", math_format(10^.x)))
  # }
  # Remove all but the last legend for plotting
  if (i != length(names(plot_list))) {
    p <- p + theme(legend.position = "none")
  }
  # Add to list
  plot_list[[index]] <- p
}
# Extract the color legend to plot separately
legend <- get_legend(
  # create some space to the left of the legend
  last(plot_list)
)
# Remove the legend from the plot itself; only the extracted one is shown
plot_list[[6]] <- plot_list[[6]] + theme(legend.position = "none")
plot_list[["Legend"]] <- legend
# Plotting
cowplot::plot_grid(plotlist = plot_list, ncol = 1)
GI on a ×10 scale makes the effect more interpretable (effect of changes of ±0.1 instead of ±1).
Show the code
# Use log of PD
data_2$log_PD <- log10(data_2$PD)
# Compare raw with scaled data side by side
par(mfrow = c(1, 2))
data_2[, plot(log2(Testing20) ~ PD)]
NULL
Show the code
# Second panel: testing vs. log10 population density
data_2[, plot(log2(Testing20) ~ log_PD)]
NULL
Show the code
par(mfrow = c(1, 1))
# Rescale testing for overview; remove those columns later
data_2$log_Testing20 <- log2(data_2$Testing20)
data_2$log_Testing21 <- log2(data_2$Testing21)
# Rescale the GI to get effect estimates in terms of a 0.1 increase of GI
data_2$GI_10 <- data_2$GI * 10
“Since the FDR controls for the number of false positives among all rejected null hypotheses, it implicitly accounts for the inflating effect of type I error produced by multicollinearity(17)” –> No?
No analysis described
Separation 2020/2021 a little arbitrary
Where is the data dictionary?
All variables µ=0, sd=1
6.1 Used packages
Show the code
# Packages used by the analysis, grouped by stage.
# Note: duplicate library() calls in the original (clusterSim, jtools) were
# removed — library() is idempotent, so behavior is unchanged.
# Cleaning
library(openxlsx)
library(tidyr)
library(dplyr)
# Old analysis
library(psych)
library(clusterSim)
library(clustertend)
# New analysis
library(jtools)
library(parameters)
library(see)
library(performance)
library(olsrr)
# NOTE(review): MASS masks dplyr::select(); load order matters for any
# later select() calls — confirm dplyr::select is used explicitly if needed.
library(MASS)
library(mlbench)
library(caret)
library(ggstatsplot)
library(ggstance)
6.2 Analysis script
This is the updated analysis script. I adapted it so it can run in this report.
Show the code
## Data
data <- copy(data_2)
## Data per year
data_sub <- as.matrix(data[, -c(1:2, 6:14, 26, 36)]) ## 2020
# data_sub<-as.matrix(data[,-c(1:6,11:14,26,36)])##2021
## Variables
# indep<-data_sub[,-c(2:3,25:28)]##disaggragated 2020
indep <- data_sub[, -c(2:3, 5:14, 17:21)] ## aggregated 2020
# indep<-data_sub[,-c(2:4,26:29)]##disaggragated 2021
# indep<-data_sub[,-c(2:4,6:15,18:22)]##aggregated 2021

## Data normalization (z-standardization, column-wise)
library(clusterSim)
indep <- data.Normalization(
  indep,
  type = "n1",
  normalization = "column",
  na.rm = FALSE
)
indep <- as.data.frame(indep)

#############################################
## FIRST MODEL
### MLR
library(jtools)
model <- lm(Testing20 ~ ., data = indep)
smodel <- summary(model) # for rse and p
## Print parameters
library(parameters)
pmodel <- model_parameters(model, summary = TRUE)
write.csv2(pmodel, "param model.csv")
## Check mlr conditions
library(see)
library(performance)
windows(width = 8, height = 5)
check_model(model)

#################################################
## WLS model
### Weighted Least Square Regression
# Define weights to use
weight <- 1 / lm(abs(model$residuals) ~ model$fitted.values)$fitted.values^2
# Perform weighted least squares regression
wls_model <- lm(Testing20 ~ ., data = indep, weights = weight)
# Check the model summary
swls <- summary(wls_model)
swls
summ(wls_model)
pwls <- model_parameters(wls_model, summary = TRUE)
# NOTE(review): this writes the model object, not pwls — confirm intent
write.csv2(wls_model, "param wls.csv")
## Check mlr conditions
windows(width = 8, height = 4)
check_model(wls_model, check = "pp_check")
## Check multicolinearity (VIF)
ols_vif_tol(wls_model)

###################################################
## Variable selection
# Option 1: stepwise selection by AIC criteria
library(olsrr)
a <- ols_step_both_aic(model)
b <- ols_step_both_aic(wls_model)
plot(b)
## Option 2
library(MASS)
library(mlbench)
library(caret)
step <- stepAIC(model, direction = "both", trace = FALSE)
step <- stepAIC(wls_model, direction = "both", trace = FALSE)
step
## Option 3: forward selection
# NOTE(review): train.control is not defined in this script — this call
# will error unless it is created elsewhere; confirm against the full repo.
step.model <- train(
  # Testing20 ~., data = indep, weights = weight,
  Testing20 ~ .,
  data = indep,
  method = "leapForward",
  tuneGrid = data.frame(nvmax = 1:5),
  trControl = train.control
)
step.model$results
step.model$bestTune
summary(step.model$finalModel)
### Option 4: best subset
sub <- ols_step_best_subset(model)
sub <- ols_step_best_subset(wls_model)
sub
plot(sub)

##########################################
## Model with selected variables
## stepAIC and subset selection
# NOTE(review): data_2 had "Urban" renamed to "UP" earlier in the report —
# confirm the column name used here matches the data actually passed in.
model4 <- lm(Testing20 ~ GDP + PD + Urban + GI + HDI,
  data = indep, weights = weight
)
smodel4 <- summary(model4)
smodel4
summ(model4)
pmodel4 <- model_parameters(model4, summary = TRUE)
write.csv2(pmodel4, "param model4.csv")
## Check mlr conditions
windows(width = 8, height = 5)
check_model(model4, check = "pp_check")
## Check multicolinearity (VIF)
ols_vif_tol(model4)
### Forward selection
model5 <- lm(Testing20 ~ GDP + CHE + EP + AAG + HDI,
  data = indep, weights = weight
)
smodel5 <- summary(model5)
smodel5
summ(model5)
pmodel5 <- model_parameters(model5, summary = TRUE)
write.csv2(pmodel5, "param model5.csv")
## Check mlr conditions
windows(width = 8, height = 5)
check_model(model5, check = "pp_check")
## Check multicolinearity (VIF)
ols_vif_tol(model5)

##########################################################
## Visualization of results
library(ggstatsplot)
windows(width = 8, height = 7)
ggcoefstats(model)
library(jtools)
library(ggstance)
windows(width = 6, height = 7)
plot_summs(model, wls_model, model4, model5, omit.coefs = NULL) ## compare models

## Extract the formula for each model
# Extract coefficients and variable names from the model
coefficients <- coef(model)
# coefficients <- coef(wls_model)
# coefficients <- coef(model4)
# coefficients <- coef(model5)
variable_names <- names(coefficients)[-1]
# Create a simplified formula string
# NOTE(review): coefficients are multiplied by -1 here — confirm the sign
# flip is intentional before quoting the formula.
formula_string <- paste(
  "Testing20 =", round(coefficients[1], 2), "+",
  paste(round(coefficients[-1], 2) * -1, variable_names, collapse = " + ")
)
formula_string
6.3 Supplemental Code
Show the code
######################################################################
## 1. Preparing COVID-19 related data (Testing, mortality, incidence,
##    vaccination)
######################################################################
## Load Global Burden Disease Classification
library(openxlsx)
library(tidyr)
library(dplyr)
country_classification <- read.xlsx("C:/path/global burden disease region.xlsx")
country_classification$GBDR7 <- as.factor(country_classification$GBDR7)

### Load COVID-19 related data
OWID <- read.csv("C:/path/owid-covid-data_220208_22c8de6.txt", sep = ",")

### Clean COVID-19 related data
OWID$Month <- lubridate::ceiling_date(as.Date(OWID$date), unit = "month") - 1
OWID$Year <- format(OWID$Month, "%Y")
OWID_Monthly <- OWID %>%
  dplyr::group_by(iso_code, continent, location, Month, Year) %>%
  dplyr::summarise(
    max_total_tests_per_Month = max(total_tests, na.rm = T),
    max_total_cases_per_Month = max(total_cases, na.rm = T),
    max_total_deaths_per_Month = max(total_deaths, na.rm = T),
    max_total_peopleVaccinated_per_Month = max(people_vaccinated, na.rm = T),
    population_InMonth = median(population),
    median_age_InMonth = median(median_age),
    population_density_InMonth = median(population_density)
  ) %>%
  arrange(iso_code, Month) %>%
  filter(Month < as.Date("2022-01-01")) %>%
  filter(Month >= as.Date("2020-03-01")) %>%
  ungroup()
# Remove -Inf induced by using max(na.rm = T) on all-NA groups
OWID_Monthly$max_total_tests_per_Month[OWID_Monthly$max_total_tests_per_Month == -Inf] <- NA
OWID_Monthly$max_total_cases_per_Month[OWID_Monthly$max_total_cases_per_Month == -Inf] <- NA
OWID_Monthly$max_total_deaths_per_Month[OWID_Monthly$max_total_deaths_per_Month == -Inf] <- NA
OWID_Monthly$max_total_peopleVaccinated_per_Month[OWID_Monthly$max_total_peopleVaccinated_per_Month == -Inf] <- NA

# Fill missing Months with NAs
OWID_MonthlyExtended <- OWID_Monthly %>%
  complete(nesting(location, iso_code, continent), Month)
# Find Months with missing entries
OWID_MonthlyExtended$containsNA <- apply(
  OWID_MonthlyExtended[, c(
    "max_total_tests_per_Month",
    "max_total_cases_per_Month",
    "max_total_deaths_per_Month"
  )],
  1,
  function(y) {
    any(is.na(y))
  }
)
# Countries with values for at least 50% of Months
countriesWithValidCaseTestDeathData <- OWID_MonthlyExtended %>%
  group_by(iso_code, continent, location) %>%
  summarise(missing = sum(containsNA), total = n()) %>%
  filter(missing / total < 0.5) %>%
  ungroup()
# Keep countries matching the number of minimal required datapoints
OWID_explicit_Monthly <- OWID_Monthly %>%
  filter(iso_code %in% countriesWithValidCaseTestDeathData$iso_code)
# List countries with missing Region (external data)
OWID_explicit_Monthly_Region_Missing_External_Data <-
  left_join(OWID_explicit_Monthly, country_classification,
    by = c("iso_code" = "Code")
  ) %>%
  filter(is.na(Region))
# Get monthly data for countries with Region
OWID_explicid_Monthly_Region <-
  left_join(OWID_explicit_Monthly, country_classification,
    by = c("iso_code" = "Code")
  ) %>%
  filter(!is.na(Region))

# Cumulative values at the end of 2020
OWID_cumulative_Values_End2020 <- OWID_explicid_Monthly_Region %>%
  filter(Year == "2020") %>%
  group_by(iso_code) %>%
  summarise(
    max20_tests = max(max_total_tests_per_Month, na.rm = T),
    max20_cases = max(max_total_cases_per_Month, na.rm = T),
    max20_deaths = max(max_total_deaths_per_Month, na.rm = T),
    max20_peopleVaccinated = max(max_total_peopleVaccinated_per_Month, na.rm = T)
  ) %>%
  mutate(Year2substract = "2021")
OWID_cumulative_Values_End2020$max20_tests[OWID_cumulative_Values_End2020$max20_tests == -Inf] <- NA
OWID_cumulative_Values_End2020$max20_cases[OWID_cumulative_Values_End2020$max20_cases == -Inf] <- NA
OWID_cumulative_Values_End2020$max20_deaths[OWID_cumulative_Values_End2020$max20_deaths == -Inf] <- NA
OWID_cumulative_Values_End2020$max20_peopleVaccinated[OWID_cumulative_Values_End2020$max20_peopleVaccinated == -Inf] <- NA

# Cumulative values at the end of 2021
OWID_cumulative_Values_End2021 <- OWID_explicid_Monthly_Region %>%
  filter(Year == "2021") %>%
  group_by(iso_code) %>%
  summarise(
    max20_21_tests = max(max_total_tests_per_Month, na.rm = T),
    max20_21_cases = max(max_total_cases_per_Month, na.rm = T),
    max20_21_deaths = max(max_total_deaths_per_Month, na.rm = T),
    max20_21_peopleVaccinated = max(max_total_peopleVaccinated_per_Month, na.rm = T)
  )
regionMeta <- OWID_explicid_Monthly_Region %>%
  select(
    iso_code, location, GBDR7, population_InMonth,
    median_age_InMonth, population_density_InMonth
  ) %>%
  distinct()
# 2021-only values = end-of-2021 cumulative minus end-of-2020 cumulative
YearsMax <- full_join(
  OWID_cumulative_Values_End2020, OWID_cumulative_Values_End2021,
  by = c("iso_code" = "iso_code")
) %>%
  mutate(
    max21_tests = max20_21_tests - coalesce(max20_tests, 0),
    max21_cases = max20_21_cases - coalesce(max20_cases, 0),
    max21_deaths = max20_21_deaths - coalesce(max20_deaths, 0),
    max21_peopleVaccinated = max20_21_peopleVaccinated - coalesce(max20_peopleVaccinated, 0)
  ) %>%
  left_join(regionMeta, by = c("iso_code" = "iso_code")) %>%
  mutate(
    max21_tests_Per100k = max21_tests / population_InMonth * 100000,
    max21_cases_Per100k = max21_cases / population_InMonth * 100000,
    max21_deaths_Per100k = max21_deaths / population_InMonth * 100000,
    max21_peopleVaccinated_Per100k = max21_peopleVaccinated / population_InMonth * 100000,
    max20_tests_Per100k = max20_tests / population_InMonth * 100000,
    max20_cases_Per100k = max20_cases / population_InMonth * 100000,
    max20_deaths_Per100k = max20_deaths / population_InMonth * 100000,
    max20_peopleVaccinated_Per100k = max20_peopleVaccinated / population_InMonth * 100000
  )

######################################################################
## 2. Data analysis
######################################################################
## Load clean dataset including all variables
data <- read.csv("C:/path/data2.txt", sep = "\t")
## Select data per year
# NOTE(review): the variable name `c` shadows base::c() below — it works
# because R resolves c(...) calls to the function, but renaming would be safer.
# c<- as.matrix(data[,-c(1:2,6:14,16:21,32,36)])## data for 2020
c <- as.matrix(data[, -c(1:6, 11:14, 16:21, 32, 36)]) ## data for 2021

## Calculate Spearman correlation and false discovery rate
# r and p-values for all regions
library(psych)
allw <- corr.test(c, method = "spearman", adjust = "fdr")
p <- print(allw$p, quote = FALSE) # p-value
r <- print(allw$r) # r-value
# r and p-values for each GBD region
region <- subset(data, GBDR7 == "Latin America and Caribbean", drop = FALSE)
# region<-subset(data, GBDR7 == "High-Income", drop=FALSE)
# region<-subset(data, GBDR7 == "Sub-Saharan Africa", drop=FALSE)
# region<-subset(data, GBDR7 == "Central Europe, Eastern Europe, and Central Asia", drop=FALSE)
# region<-subset(data, GBDR7 == "North Africa and Middle East", drop=FALSE)
# region<-subset(data, GBDR7 == "Southeast Asia, East Asia, and Oceania", drop=FALSE)
# region <- as.matrix(region[-c(1:2,6:14,16:21,32,36)])# data for 2020
region <- as.matrix(region[, -c(1:6, 11:14, 16:21, 32, 36)]) # data for 2021
region <- corr.test(region, method = "spearman", adjust = "fdr")
p <- print(region$p, quote = FALSE) # p-value
r <- print(region$r) # r-value

## Data normalization for PCA
library(clusterSim)
df.n <- data.Normalization(c, type = "n1", normalization = "column")
rownames(df.n) <- data$location
## Performing PCA
pca <- prcomp(df.n)
summary(pca)
## Selection of significant axes
## Assessing clustering tendency on PCA results
pca <- prcomp(df.n, center = TRUE, scale. = FALSE, rank. = 5)
results <- pca$x # first five axes
library(clustertend)
hopkins(df.n, n = nrow(results) - 1) # H<0.5 non clusterable